filter_features <- function(df_train_sm){
df_fs <- df_train_sm %>%
select(-engagement) %>%
select(-content_crashes) %>%
select(-client_id) %>%
select(-label_beta) %>%
select(-label_release) %>%
select(-is_release) %>%
select(-app_version)
return(df_fs)
}This analysis is focused on utilizing Boruta as a initial pre-filter to the covariates, to narrow the feature selection search space.
Apply Boruta to each performance covariate.
engagement <- c('active_hours','active_hours_max','uri_count','uri_count_max','search_count','search_count_max','num_pages','num_pages_max','daily_max_tabs','daily_max_tabs_max','daily_unique_domains','daily_unique_domains_max','daily_tabs_opened','daily_tabs_opened_max')
set.seed(1234)
df_train_sm <- df_train_encoder %>% sample_n(10000)
df_fs <- filter_features(df_train_sm)## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(engagement)` instead of `engagement` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Boruta is a feature selection algorithm based on the random forest algorithm. In the process of deciding if a feature is important or not, some features may be marked as Tentative. Sometimes increasing the maxRuns can help resolve the Tentativeness of the feature.
boruta_results <- list()
for (metric in engagement){
print(paste('Applying Boruta to ', metric))
boruta.out <- Boruta(y = df_train_sm[[metric]], x=df_fs, doTrace=0)
boruta_results[[metric]] <- boruta.out
}## [1] "Applying Boruta to active_hours"
## [1] "Applying Boruta to active_hours_max"
## [1] "Applying Boruta to uri_count"
## [1] "Applying Boruta to uri_count_max"
## [1] "Applying Boruta to search_count"
## [1] "Applying Boruta to search_count_max"
## [1] "Applying Boruta to num_pages"
## [1] "Applying Boruta to num_pages_max"
## [1] "Applying Boruta to daily_max_tabs"
## [1] "Applying Boruta to daily_max_tabs_max"
## [1] "Applying Boruta to daily_unique_domains"
## [1] "Applying Boruta to daily_unique_domains_max"
## [1] "Applying Boruta to daily_tabs_opened"
## [1] "Applying Boruta to daily_tabs_opened_max"
for (metric in engagement){
plot(boruta_results[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
}Find the top 5 ranking features per metric, and add to a list.
features_top5 <- NULL
for(metric in engagement){
features_top5 <- c(names(sort(apply(boruta_results[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5)
}
top5 <- sort(c(unique(features_top5)))
x <- data.frame(top5)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top5 |
|---|
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT |
| fxa_configured_False |
| fxa_configured_True |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |
| timezone_cat_(0,2] |
Increasing to 10:
features_top10 <- NULL
for(metric in engagement){
features_top10 <- c(names(sort(apply(boruta_results[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10)
}
top10 <- sort(c(unique(features_top10)))
x <- data.frame(top10)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top10 |
|---|
| country_GB |
| country_US |
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| default_search_engine_other (non-bundled) |
| FX_PAGE_LOAD_MS_2_PARENT |
| fxa_configured_False |
| fxa_configured_True |
| memory_cat |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| startup_ms |
| startup_ms_max |
| sync_configured_False |
| sync_configured_True |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |
| timezone_cat_(0,2] |
Equalize by label, then perform the above.
df_beta <- df_train_encoder %>%
filter(label_beta == 1)
n_beta <- nrow(df_beta)
set.seed(1234)
df_rel <- df_train_encoder %>%
filter(label_beta == 0) %>%
sample_n(n_beta)
set.seed(1234)
df_train_f_sm_eq <- df_rel %>%
bind_rows(df_beta) %>%
sample_n(10000)
df_fs_eq <- df_train_f_sm_eq %>%
select(-engagement) %>%
select(-content_crashes) %>%
select(-client_id) %>%
select(-label_beta) %>%
select(-label_release) %>%
select(-is_release) %>%
select(-app_version)boruta_results_eq <- list()
for (metric in engagement){
print(paste('Applying Boruta to ', metric))
boruta.out <- Boruta(y = df_train_f_sm_eq[[metric]], x=df_fs_eq, doTrace=0)
boruta_results_eq[[metric]] <- boruta.out
}## [1] "Applying Boruta to active_hours"
## [1] "Applying Boruta to active_hours_max"
## [1] "Applying Boruta to uri_count"
## [1] "Applying Boruta to uri_count_max"
## [1] "Applying Boruta to search_count"
## [1] "Applying Boruta to search_count_max"
## [1] "Applying Boruta to num_pages"
## [1] "Applying Boruta to num_pages_max"
## [1] "Applying Boruta to daily_max_tabs"
## [1] "Applying Boruta to daily_max_tabs_max"
## [1] "Applying Boruta to daily_unique_domains"
## [1] "Applying Boruta to daily_unique_domains_max"
## [1] "Applying Boruta to daily_tabs_opened"
## [1] "Applying Boruta to daily_tabs_opened_max"
for (metric in names(boruta_results_eq)){
plot(boruta_results_eq[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
}Find the top 5 ranking features per metric, and add to a list.
features_top5 <- NULL
for(metric in names(boruta_results_eq)){
features_top5 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5)
}
x <- data.frame(top5 = sort(c(unique(features_top5))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top5 |
|---|
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |
Increasing to 10:
features_top10 <- NULL
for(metric in names(boruta_results_eq)){
features_top10 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10)
}
x <- data.frame(top10 = sort(c(unique(features_top10))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top10 |
|---|
| cpu_speed_mhz |
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| default_search_engine_other (non-bundled) |
| FX_PAGE_LOAD_MS_2_PARENT |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| startup_ms |
| startup_ms_max |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |